异常点检测模型实验
一个小测试。
import sys
import time
# import logging
import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.tseries.offsets import *
# import plotly_express as px
# import cufflinks as cf
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
import cufflinks as cf
cf.set_config_file(offline=True, theme='pearl')
import talib as ta
import plotly_express as px
#collapse-show
# Load the sample transaction records.
df = pd.read_csv('./sample_21.txt')
df
# Replace the raw header with descriptive column names.
column_names = ['time', 'cash_amount', 'comment', 'channel', 'starter_id', 'starter_age', 'starter_area', 'receiver_id']
df.columns = column_names
df.describe()
数据探索
这一部分工作很适合用Tableau来做,使用Tableau分析的结果详见 《AliAntTest_数据探索及模型选择.pdf》。
模型的选择
风险监控问题的本质是分类问题,目前想到的可用模型有:
-
非监督学习——聚类;
在聚类效果上,谱聚类、DBscan的方法是非线性的,精度最优,但DBscan要探索一个阈值,灵活性低。
而谱聚类方法可以通过指定cluster个数来聚类,风险监控场景可以看作二分类场景,聚类个数一定是2,所以
谱聚类比较贴合这个场景。几种常用聚类方法的比较:

-
监督学习——深度学习模型;
监督学习需要有大量标注数据,在这个场景下显然是不适合的。
不过阿里应该不缺这些数据,比如用户在被盗、被骗之后大概率会联系阿里,告知哪些交易非本人操作,抑或哪些交易是被骗的,这些反馈可以比较准确地帮助阿里沉淀带标签的数据。
-
异常点检测
这一类算法是在风控模型里比较常用的一种方法,常用的是
Isolation Forest算法。
在这里为了照顾模型精度和开发工作量,选用两种聚类方法K-means、谱聚类和Isolation Forest算法来构建模型。
Notes:特征提取方法:连续值用z-score归一化处理,离散值用one-hot方法做稀疏处理。
# --- Per-transaction feature extraction ---
# Continuous columns: z-score standardization (zero mean, unit variance).
z_score_scaler = preprocessing.StandardScaler()
z_score_feature = z_score_scaler.fit_transform(df[['cash_amount', 'starter_age']])
features = pd.DataFrame(z_score_feature, columns=['cash_amount', 'starter_age'])
features
# Discrete column `channel`: one-hot (sparse) encoding.
ch_one_hot_enc = preprocessing.OneHotEncoder()
ch_one_hot_enc.fit(df[['channel']])
# enc.n_values_
ch_one_hot_enc
res = ch_one_hot_enc.transform(df[['channel']]).toarray()
for i in range(res.shape[1]):
    # BUG FIX: was `res.T[0]` — every ch_* column received the values of
    # one-hot column 0 instead of its own column i.
    features[f'ch_{i}'] = res[:, i]
features
# Discrete column `starter_area`: one-hot (sparse) encoding.
area_one_hot_enc = preprocessing.OneHotEncoder()
area_one_hot_enc.fit(df[['starter_area']])
area_one_hot_enc
res = area_one_hot_enc.transform(df[['starter_area']]).toarray()
for i in range(res.shape[1]):
    # BUG FIX: was `res.T[0]` — same copy-paste defect as the channel loop.
    features[f'area_{i}'] = res[:, i]
features
import ipyvolume as ipv
%%time
# Project the per-transaction feature matrix to 3-D with t-SNE for visual inspection.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3)
res = tsne.fit_transform(features)
df_res = pd.DataFrame(index=features.index)
for i in range(3):
df_res[i] = res[:, i]
df_res
# Rename the three t-SNE axes for the cufflinks 3-D scatter plot.
df_res.columns = ['x', 'y', 'z']
fig = df_res.iplot(kind='scatter3d', x='x', y='y', z='z', size=8, colors=['pink'],
title='123',width=0.5, layout=dict(height=500, width=700, margin=dict(b=0, t=0)), opacity=1, asFigure=True)
# test_df.iplot(kind='scatter3d',x='x',y='y',z='z',size=15,categories='categories',text='text',
# title='Cufflinks - Scatter 3D Chart',colors=['blue','pink'],width=0.5,margin=(0,0,0,0),
# opacity=1, asFigure=True)
# fig = test_df.iplot(kind='scatter3d',x='x',y='y',z='z',size=15,categories='categories',text='text',
# title='Cufflinks - Scatter 3D Chart',colors=['blue','pink'],width=0.5,margin=(0,0,0,0),
# opacity=1, asFigure=True)
# fig.show()
# Render the cufflinks figure inline as HTML.
from IPython.display import HTML
HTML(fig.to_html())
from matplotlib import cm
import ipyvolume as ipv
colormap = cm.Spectral
# _c = [i*3 for i in pd.Categorical(df_res['id']).codes]
# color = colormap(_c)
# NOTE(review): df_res's columns were renamed to 'x'/'y'/'z' above, so indexing
# df_res[0]/[1]/[2] here would raise KeyError under linear execution — this cell
# presumably relied on out-of-order notebook execution; verify before rerunning.
ipv.quickscatter(df_res[0], df_res[1], df_res[2], size=1) # , marker="sphere", color=color[:,:3]

从降维后的结果可以看到:这种方法提取的特征无法区分数据。
# --- Per-account aggregate features ---
# For each sending account: number of transfers started and total amount sent.
f = {'cash_amount': ['count', 'sum']}
starter_df = df.groupby('starter_id').aggregate(f)
# Flatten the (column, agg) MultiIndex into prefixed flat names.
# (The original also built '_'.join(...)-style names first, then immediately
# overwrote them — that dead assignment has been removed.)
starter_df.columns = ['s_cash_amount_count', 's_cash_amount_sum']
starter_df
# For each receiving account: number of transfers received and total amount received.
receiver_df = df.groupby('receiver_id').aggregate(f)
receiver_df.columns = ['e_cash_amount_count', 'e_cash_amount_sum']
receiver_df
# Outer-join so accounts that only send or only receive are both retained.
features = receiver_df.join(starter_df, how='outer')
# An account missing on one side simply has no activity there — fill NaN with 0.
features = features.fillna(0)
features
# Standardize the four per-account aggregate features (zero mean, unit variance).
z_score_scaler = preprocessing.StandardScaler()
z_score_feature = z_score_scaler.fit_transform(features)
_features = pd.DataFrame(z_score_feature, index=features.index)
_features
%%time
# Project the standardized aggregate features to 3-D with t-SNE for visualization.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3)
_res = tsne.fit_transform(_features)
df_res = pd.DataFrame(index=_features.index)
for i in range(3):
df_res[i] = _res[:, i]
# Move the account-id index into a regular column for plotting.
df_res.reset_index(inplace=True)
df_res
from matplotlib import cm
import ipyvolume as ipv
colormap = cm.Spectral
# _c = [i*3 for i in pd.Categorical(df_res['id']).codes]
# color = colormap(_c)
# 3-D scatter of the t-SNE embedding of the per-account aggregate features.
ipv.quickscatter(df_res[0], df_res[1], df_res[2], size=1) # , marker="sphere", color=color[:,:3]

从降维后的结果可以看到:这种方法提取的特征很容易区分异常数据。
%%time
# K-means with 2 clusters (normal vs. anomalous) on the standardized aggregate features.
from sklearn.cluster import KMeans
y_pred = KMeans(n_clusters=2, random_state=10).fit_predict(_features)
from matplotlib import cm
import ipyvolume as ipv
df_res = _features.reset_index()
# pd.Categorical(y_pred).codes
colormap = cm.Spectral
# Spread the two cluster codes apart so they map to clearly distinct colormap colors.
_c = [i*100 for i in pd.Categorical(y_pred).codes]
color = colormap(_c)
# NOTE(review): plots raw feature columns 0, 1 and 3 — column 2 is skipped;
# possibly a typo for df_res[2]. Confirm before reuse.
ipv.quickscatter(df_res[0], df_res[1], df_res[3], size=2, marker="sphere", color=color[:,:3])

# Inspect the original (unscaled) aggregate rows that K-means put in cluster 1.
_df = pd.DataFrame(y_pred, index=_features.index)
features.loc[_df[_df[0]==1].index]
耗时39ms,但效果很差。
%%time
# Spectral clustering with 2 clusters (normal vs. anomalous) on the same features.
from sklearn.cluster import SpectralClustering
y_pred = SpectralClustering(n_clusters=2).fit_predict(_features)
from matplotlib import cm
import ipyvolume as ipv
df_res = _features.reset_index()
# pd.Categorical(y_pred).codes
colormap = cm.Spectral
# Spread the two cluster codes apart so they map to clearly distinct colormap colors.
_c = [i*100 for i in pd.Categorical(y_pred).codes]
color = colormap(_c)
# NOTE(review): as in the K-means cell, feature column 2 is skipped in favor of
# column 3 — possibly a typo; confirm.
ipv.quickscatter(df_res[0], df_res[1], df_res[3], size=2, marker="sphere", color=color[:,:3])
# Inspect the accounts assigned to cluster 1 (candidate anomalies).
_df = pd.DataFrame(y_pred, index=_features.index)
_df[_df[0]==1]
features.loc[_df[_df[0]==1].index]

耗时6.46s,效果很好。
异常数据为:

# Reuse the standardized per-account aggregate features for Isolation Forest.
features = _features
%%time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from scipy import stats
rng = np.random.RandomState(42)
# Build the training sample
n_samples = features.shape[0] # total number of samples
outliers_fraction = 1/n_samples # assumed outlier fraction (i.e. exactly one anomaly)
clf = IsolationForest(max_samples=n_samples , random_state=rng, contamination=outliers_fraction) # , random_state=rng, contamination=outliers_fraction
clf.fit(features)
# y_pred_train = clf.predict(X_train)
scores_pred = clf.decision_function(features)
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction) # threshold derived from the assumed outlier fraction, used for plotting/selection
scores_pred
threshold
# Flag the rows whose anomaly score falls below the percentile threshold.
_df = pd.DataFrame(scores_pred)
res = _df[_df[0] < threshold]
features.iloc[res.index]
效果:耗时616ms,结果准确。